import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
def lgb_model_init(x_train, y_train):
    """Fit and return a LightGBM classifier with default hyperparameters.

    Serves as the untuned baseline to compare against the grid-searched
    model from ``lgb_model_tuned``.
    """
    baseline = LGBMClassifier()
    baseline.fit(x_train, y_train)
    return baseline
def lgb_model_tuned(x_train, y_train):
    """Grid-search LightGBM hyperparameters and return the best estimator.

    Runs a 4-fold cross-validated exhaustive search over leaf count, depth,
    minimum samples per leaf, learning rate, and number of trees, then
    returns the refit best model.

    Parameters
    ----------
    x_train, y_train : training features and labels.

    Returns
    -------
    LGBMClassifier fitted with the best parameter combination.
    """
    # Use the canonical sklearn-API name `min_child_samples` instead of the
    # LightGBM-native alias `min_data_in_leaf`; the alias provokes the
    # "min_data_in_leaf is set ... min_child_samples will be ignored"
    # warning on every fit. The searched values are unchanged.
    param_grid = {
        'num_leaves': [11, 21, 31],
        'max_depth': [5, 8, 10],
        'min_child_samples': [10, 15, 20],
        'learning_rate': [0.1, 0.003, 0.001],
        'n_estimators': [50, 100],
    }
    clf = LGBMClassifier()
    grid = GridSearchCV(estimator=clf, param_grid=param_grid,
                        cv=4, verbose=5, n_jobs=-1)
    grid.fit(x_train, y_train)
    return grid.best_estimator_
# Load the preprocessed dataset; first CSV column is the index.
data = pd.read_csv('preprocessing1.csv', encoding='cp949', index_col=0)
seed = 5764

# `pop` removes the label column from `data` and returns it in one step
# (equivalent to the drop + assign pair).
target = data.pop('Status')

# Train/test split (0.7:0.3), fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    data, target, test_size=0.3, random_state=seed
)
lgb_model = lgb_model_tuned(x_train, y_train)
# Notebook output: Fitting 4 folds for each of 162 candidates, totalling 648 fits
# [LightGBM] [Warning] min_data_in_leaf is set=10, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=10
import numpy as np
from lime import lime_tabular

# Build a LIME explainer over the training data; `data.columns` holds the
# feature names (label column was already removed).
lgb_lime_explainer = lime_tabular.LimeTabularExplainer(
    np.array(x_train),
    feature_names=data.columns,
    mode="classification",
)

# Explain the first ten test rows one at a time and render each in the
# notebook; the printed index tracks progress.
for i in range(10):
    lgb_lime_explanation = lgb_lime_explainer.explain_instance(
        x_test.iloc[i], lgb_model.predict_proba
    )
    lgb_lime_explanation.show_in_notebook(show_table=True)
    print(i)
# Notebook output: the loop above printed the indices 0 through 9.
import shap

# Tree-based SHAP explainer for the tuned LightGBM model.
lgb_shap_explainer = shap.TreeExplainer(lgb_model)

# NOTE: for a LightGBM binary classifier, TreeExplainer returns the SHAP
# values as a list of two ndarrays (one per class) — see the notebook
# warning that was emitted here.
shap_values = lgb_shap_explainer.shap_values(x_test)

shap_values0 = shap_values[0]  # per-feature attributions toward class 0
shap_values1 = shap_values[1]  # per-feature attributions toward class 1

# Notebook display of the raw SHAP values (rendered as a table below).
pd.DataFrame(shap_values)
# Notebook output: the DataFrame above rendered as a 43195-row x 63-column
# table of SHAP values (values omitted here; they are reproducible by
# re-running the cell).
# Global feature-importance summary using the class-1 SHAP values.
shap.summary_plot(shap_values1, x_test)
# Dependence plots: SHAP value vs. feature value for two features of interest.
shap.dependence_plot('rate_of_interest', shap_values1, x_test)
shap.dependence_plot('Upfront_charges', shap_values1, x_test)